In [ ]:
import pandas as pd
import os
import sklearn as skl
import nltk
import re
In [ ]:
def readData(filename):
    """Load a CSV file located relative to the current working directory.

    The resolved path is printed before reading so the user can see which
    file is being loaded.

    Args:
        filename: Name (or relative path) of the CSV file. Because the path
            is built with ``os.path.join``, an absolute path is also
            accepted and used as-is.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    # os.path.join uses the platform separator instead of a hard-coded "/".
    path = os.path.join(os.getcwd(), filename)
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(path)
    return pd.read_csv(path)
Functions to remove HTML tags, URLs, mention markers (@), and trailing hashtags
In [ ]:
def cleanhtml(tweet):
    """Return ``tweet`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed individually. ``.``
    does not match a newline, so a ``<`` and ``>`` on different lines are
    left in place.
    """
    return re.sub('<.*?>', '', tweet)
def cleanUrl(tweet):
    """Return ``tweet`` with URL-like tokens removed.

    Anything starting at ``http`` and running up to the next whitespace
    character is deleted; surrounding whitespace is left untouched.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", tweet)
def removeMention(tweet):
    """Drop every ``@`` character from ``tweet`` and strip trailing whitespace.

    Only the ``@`` symbol is removed; the text that followed it (such as a
    user name) stays in the tweet. Leading whitespace is preserved.
    """
    without_at = "".join(tweet.split("@"))
    return without_at.rstrip()
def removeTrailingHash(tweet):
    """Strip hashtags from the end of ``tweet``.

    Repeatedly removes the last space-separated token while that token
    starts with ``#``. Hashtags in the middle of the text are kept, and a
    tweet that is (or is reduced to) a single token is returned unchanged
    even when that token is a hashtag.

    Args:
        tweet: The tweet text.

    Returns:
        The tweet without its trailing run of hashtags.
    """
    # Zero or one whitespace-separated token: nothing to trim. Using <= 1
    # (rather than == 1) also covers empty / whitespace-only input, which
    # previously fell through to an out-of-range index.
    if len(tweet.split()) <= 1:
        return tweet
    while True:
        # rpartition never raises: when there is no space, sep is "" and
        # the text is returned as-is (e.g. tokens separated by a newline).
        head, sep, last = tweet.rpartition(' ')
        if not sep or not last.startswith("#"):
            return tweet
        tweet = head
In [ ]:
def preprocess(filename):
    """Read a CSV of tweets and clean its ``text`` column in place.

    The cleaning steps are applied to every value of ``df['text']`` in this
    order: ``cleanhtml``, ``cleanUrl``, ``removeMention``,
    ``removeTrailingHash``.

    Args:
        filename: CSV file name, passed to ``readData``.

    Returns:
        The DataFrame returned by ``readData`` with its ``text`` column
        replaced by the cleaned text.
    """
    df = readData(filename)
    cleaned = df['text']
    # Order matters: removeMention strips trailing whitespace before
    # removeTrailingHash inspects the last space-separated token.
    for step in (cleanhtml, cleanUrl, removeMention, removeTrailingHash):
        cleaned = cleaned.apply(step)
    df['text'] = cleaned
    return df
In [ ]:
#to test
#filename = "clinton-50k.csv"
#df = preprocess(filename)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: